import os, os.path, sys
import re, urllib

class TitleIndexer:
    """An HTML index page producer for a set of HTML files with titles.

    The indexer walks ``top_dir`` recursively, extracts the ``<title>`` of
    every file found, and writes into ``out_dir``:

    * ``all.html`` -- every title, grouped by upper-cased first character,
      with a navigation bar of initials at the top and at the bottom;
    * ``<initial>.html`` -- one page per initial with only that group.

    Titles are written into the generated pages exactly as extracted
    (after URL-unquoting); no HTML escaping is applied.
    """

    def __init__(self, top_dir, out_dir, link_prefix="articles/"):
        """Validate the input directory and set up empty index maps.

        top_dir     -- directory that is walked for input files.  If it is
                       not a directory, a message and the usage text are
                       printed and the process exits with status 1.
        out_dir     -- directory the index pages are written to (created
                       on demand by output_html).
        link_prefix -- string prepended to every relative file path in the
                       generated HREFs; defaults to the historical
                       "articles/".
        """
        if not os.path.isdir(top_dir):
            print("%s should be a directory name." % (top_dir,))
            usage()
            # An invalid argument is an error, so report a non-zero status.
            sys.exit(1)
        # remove possible path delimiter "/" or "\" in the end
        self.top_dir = os.path.normpath(top_dir)
        self.out_dir = out_dir
        self.link_prefix = link_prefix

        # regular expression to extract the content of the title; HTML tag
        # names are case-insensitive, so <TITLE> must match as well
        self.title_re = re.compile(r"<title>(.*?)</title>",
                                   re.DOTALL | re.IGNORECASE)

        # a mapping between initials and a list of titles, e.g.:
        # {'A' -> ["alabama", "alaska", ...], 'B' -> ...}
        self.initial2title = {}
        # a mapping between titles and a list of relative file names, e.g.:
        # {"alabama" -> ["00/23453.html", ...], "alaska" -> [ ], ... }
        self.title2path = {}

    def run(self):
        """Walk top_dir, build both index maps, then write the HTML pages.

        Files for which get_title() yields no usable title are skipped.
        Each visited directory name is printed as a progress indication.
        """
        for dirpath, dirnames, filenames in os.walk(self.top_dir):
            print(dirpath)
            for name in filenames:
                filepath = os.path.join(dirpath, name)
                title = self.get_title(filepath)
                if not title:
                    # no <title> element, or an empty one: nothing to index
                    continue

                # relative file path with respect to but excluding top_dir;
                # always use "/" because the path ends up inside an HREF
                relative_filepath = os.path.relpath(filepath, self.top_dir)
                relative_filepath = relative_filepath.replace(os.sep, "/")

                paths = self.title2path.get(title)
                if paths is None:
                    # First file with this title: register the title under
                    # its initial exactly once.  Registering it per file
                    # would make output_html emit every link several times.
                    paths = self.title2path[title] = []
                    first_char = title[0].upper()
                    self.initial2title.setdefault(first_char, []).append(title)
                paths.append(relative_filepath)

        for title_list in self.initial2title.values():
            title_list.sort()
        for file_list in self.title2path.values():
            file_list.sort()

        self.output_html()

    @staticmethod
    def _unquote(text):
        """URL-unquote text using whichever urllib layout is available."""
        try:
            from urllib.parse import unquote  # Python 3
        except ImportError:
            from urllib import unquote  # Python 2
        return unquote(text)

    def get_title(self, filepath):
        """Return the URL-unquoted, whitespace-stripped title of a file.

        Returns None when the file contains no <title>...</title> element
        or cannot be decoded as text; returns "" for an empty title.
        """
        try:
            with open(filepath) as handle:
                text = handle.read()
        except UnicodeDecodeError:
            # Only raised where reading decodes text (Python 3): the file
            # is binary or in another encoding, so it has no usable title.
            return None
        match = self.title_re.search(text)
        if match is None:
            return None
        return self._unquote(match.group(1)).strip()

    def output_html(self):
        """Write all.html and one page per initial into out_dir."""
        if not os.path.isdir(self.out_dir):
            os.makedirs(self.out_dir)

        html_start = """
<!DOCTYPE html>
<HTML lang="en">
<HEAD>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<TITLE>
Index Generated by TitleIndexer
</TITLE>
</HEAD>
<BODY>
<HR>
"""
        html_end = """
</BODY>
</HTML>
        """

        sorted_initial_list = sorted(self.initial2title)

        # Generate all initials (navigation bar used at top and bottom)
        tmp_initial = '<A HREF="#_%s_">%s</A>\n'
        html_initial = "".join(tmp_initial % (initial, initial)
                               for initial in sorted_initial_list)
        html_initial += "<HR>"

        # Generate all titles per initials
        tmp_title_initial = '<A ID="_%s_"></A>\n<H2>%s</H2>\n'
        tmp_title = '<A HREF="%s%s">%s</A><BR>\n'
        sections = []
        for initial in sorted_initial_list:
            links = "".join(
                tmp_title % (self.link_prefix, file_name, title)
                for title in self.initial2title[initial]
                for file_name in self.title2path[title])
            initial_body = tmp_title_initial % (initial, initial) + links
            sections.append(initial_body + "<HR>")

            # A.html, B.html, ...
            # NOTE(review): the initial is used verbatim as a file name and
            # anchor id; characters such as "/" or '"' are not sanitized.
            page_path = self.out_dir + os.sep + initial + ".html"
            with open(page_path, "w") as page:
                page.write(html_start)
                page.write(initial_body)
                page.write(html_end)

        with open(os.path.join(self.out_dir, "all.html"), "w") as index:
            index.write(html_start)
            index.write(html_initial)
            index.write("".join(sections))
            index.write(html_initial)
            index.write(html_end)


def usage():
    """Print the command-line usage summary to standard output."""
    # Single-string print calls produce the same output under the print
    # statement (Python 2) and the print function (Python 3).
    print("Usage: python TitleIndexer.py in_dir out_dir")
    print("        in_dir: where all the input html files are stored")
    print("        out_dir: where the generated indices are stored")

if __name__ == '__main__':
    # Command-line entry point: TitleIndexer.py in_dir out_dir
    args = sys.argv[1:]
    # Honour either help flag in any position.
    if "-h" in args or "--help" in args:
        usage()
        sys.exit(0)
    # A wrong number of arguments is a usage error, so exit non-zero.
    if len(args) != 2:
        usage()
        sys.exit(1)
    indexer = TitleIndexer(args[0], args[1])
    indexer.run()
